In [1]:
import numpy as np
import pandas as pd


#Data Reader from Internet
#pip install pandas-datareader
    
import matplotlib.pyplot as plt
import seaborn as sns

#visualization inside Jupyter Notebook
%matplotlib inline

#display image in Jupyter Notebook
from IPython.display import Image

###################
# Interactive plots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# get version
from plotly import __version__
#print(__version__)

import cufflinks as cf

# For Notebooks
init_notebook_mode(connected=True) 
# For offline use
cf.go_offline()

#[[Plotly "after May 2020"
# + pip install chart-studio
#import chart-studio.plotly as py

###################
#Machine Learning (pip install scikit-learn)
# Import pattern: from sklearn.<model_family> import <Model>

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC

from sklearn.metrics import classification_report, confusion_matrix
#from sklearn.metrics import confusion_matrix


#from sklearn.datasets import load_boston #deprecated
# BUT others available (load_breast_cancer, ...)
In [ ]:
# MatPlotLib rearrange display (fixes overlapping labels/titles)
# NOTE(review): snippet only — assumes a Figure object `fig` already exists in scope.
fig.tight_layout() 
# or, via the pyplot state machine, on the current figure:
plt.tight_layout() 
In [ ]:
# SeaBorn Load dataset
# Loads seaborn's built-in 'tips' example dataset as a DataFrame.
tips = sns.load_dataset('tips')
In [5]:
# Load the Titanic training set from a local CSV (relative path).
train = pd.read_csv('datas/titanic_train.csv') #index_col=0)
In [44]:
train # 891 x 12
Out[44]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S
... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 24.0 1 2 W./C. 6607 23.4500 S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 Q

891 rows × 11 columns

In [4]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
In [5]:
train.describe()
Out[5]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
In [48]:
##################
# Visualize missing data: one heatmap cell per DataFrame entry,
# lit where the value is null (Age and Cabin stand out).
##################
missing_mask = train.isnull()
sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap='viridis')
Out[48]:
<AxesSubplot: >
In [ ]:
####################################
# drop a column
# (Cabin is mostly null — see the missing-data heatmap above)
####################################
train.drop('Cabin',axis=1 #axis=0 for rows, axis=1 for columns
          ,inplace=True) #inplace mutates the train DataFrame itself (no copy returned)
train # 891 x 11
In [47]:
####################################
# fill missing values (here Age replaced by mean age in Pclass)
####################################
def impute_age(cols):
    """Return the passenger's Age, imputing a per-class estimate when missing.

    Parameters
    ----------
    cols : pandas.Series
        A row slice with labels 'Age' and 'Pclass', as produced by
        ``train[['Age','Pclass']].apply(impute_age, axis=1)``.

    Returns
    -------
    float
        The original Age when present; otherwise 37 for 1st class,
        29 for 2nd class, 24 for 3rd class.
    """
    # Label-based access: positional indexing (cols[0]) on a labelled Series
    # is deprecated in pandas 2.x and removed in pandas 3.0.
    Age = cols['Age']
    Pclass = cols['Pclass']

    if pd.isnull(Age):

        if Pclass == 1:
            return 37

        elif Pclass == 2:
            return 29

        else:
            return 24

    else:
        return Age
    
# Row-wise apply: each row's ['Age','Pclass'] slice is passed to impute_age.
train['Age'] = train[['Age','Pclass']].apply(impute_age,axis=1)
train
Out[47]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S
... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 24.0 1 2 W./C. 6607 23.4500 S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 Q

891 rows × 11 columns

In [49]:
####################################
# drop null values (DROPS THE ROW containing null values)
# After Age imputation, the only remaining nulls are 2 missing Embarked
# values (see train.info(): Embarked 889 non-null), hence 891 -> 889 rows.
####################################
train.dropna(inplace=True)
train # 889 x 11
Out[49]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S
... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 24.0 1 2 W./C. 6607 23.4500 S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 Q

889 rows × 11 columns

In [ ]:
# NOTE(review): 'male' is the dummy column created further down (get_dummies on
# 'Sex'); this cell only works after that one has run — out-of-order execution.
train.drop('male',axis=1 #axis=0 for rows, axis=1 for columns
          ,inplace=True)
In [58]:
train
Out[58]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S
... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 24.0 1 2 W./C. 6607 23.4500 S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 Q

889 rows × 11 columns

In [8]:
####################################
# replace non numeric values (short)
####################################
loans = pd.read_csv('datas/loan_data.csv')

#cat_feats = ['purpose']

# One-hot encode the categorical 'purpose' column into purpose_* indicator columns.
final_data = pd.get_dummies(loans,columns=['purpose']) # ,drop_first=True
In [7]:
loans['purpose'].unique()
Out[7]:
array(['debt_consolidation', 'credit_card', 'all_other',
       'home_improvement', 'small_business', 'major_purchase',
       'educational'], dtype=object)
In [9]:
final_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   credit.policy               9578 non-null   int64  
 1   int.rate                    9578 non-null   float64
 2   installment                 9578 non-null   float64
 3   log.annual.inc              9578 non-null   float64
 4   dti                         9578 non-null   float64
 5   fico                        9578 non-null   int64  
 6   days.with.cr.line           9578 non-null   float64
 7   revol.bal                   9578 non-null   int64  
 8   revol.util                  9578 non-null   float64
 9   inq.last.6mths              9578 non-null   int64  
 10  delinq.2yrs                 9578 non-null   int64  
 11  pub.rec                     9578 non-null   int64  
 12  not.fully.paid              9578 non-null   int64  
 13  purpose_all_other           9578 non-null   uint8  
 14  purpose_credit_card         9578 non-null   uint8  
 15  purpose_debt_consolidation  9578 non-null   uint8  
 16  purpose_educational         9578 non-null   uint8  
 17  purpose_home_improvement    9578 non-null   uint8  
 18  purpose_major_purchase      9578 non-null   uint8  
 19  purpose_small_business      9578 non-null   uint8  
dtypes: float64(6), int64(7), uint8(7)
memory usage: 1.0 MB
In [5]:
loans.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB
In [59]:
####################################
# replace non numeric values
# NOTE: the first two get_dummies calls are demonstrations only — their
# results are discarded; only the third is kept and merged into train.
####################################
pd.get_dummies(train['Sex']) # returns tab with columns = values and rows = true/false (1/0)
# for sex it's male or female, so we can drop first column
pd.get_dummies(train['Sex'],drop_first=True)
# save in tab
sex = pd.get_dummies(train['Sex'],drop_first=True)
# add it to our data
train = pd.concat([train,sex],axis=1)

train
Out[59]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked male
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C 0
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S 0
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S 1
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 S 1
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 S 0
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 24.0 1 2 W./C. 6607 23.4500 S 0
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C 1
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 Q 1

889 rows × 12 columns

In [64]:
train['Embarked'].unique() #unique values in a column
Out[64]:
array(['S', 'C', 'Q'], dtype=object)
In [60]:
pd.get_dummies(train['Embarked'],drop_first=True)
Out[60]:
Q S
0 0 1
1 0 0
2 0 1
3 0 1
4 0 1
... ... ...
886 0 1
887 0 1
888 0 1
889 0 0
890 1 0

889 rows × 2 columns

In [93]:
# save in tab
embarked = pd.get_dummies(train['Embarked'],drop_first=True)
# add it to our data
train = pd.concat([train,embarked],axis=1)
# the original text column is now redundant with the Q/S dummy columns
train.drop('Embarked',axis=1,inplace=True)
train
Out[93]:
Survived Pclass Age SibSp Parch Fare male Q S
0 0 3 22.0 1 0 7.2500 1 0 1
1 1 1 38.0 1 0 71.2833 0 0 0
2 1 3 26.0 0 0 7.9250 0 0 1
3 1 1 35.0 1 0 53.1000 0 0 1
4 0 3 35.0 0 0 8.0500 1 0 1
... ... ... ... ... ... ... ... ... ...
886 0 2 27.0 0 0 13.0000 1 0 1
887 1 1 19.0 0 0 30.0000 0 0 1
888 0 3 24.0 1 2 23.4500 0 0 1
889 1 1 26.0 0 0 30.0000 1 0 0
890 0 3 32.0 0 0 7.7500 1 1 0

889 rows × 9 columns

In [92]:
# NOTE(review): drops the 'Q' dummy column, but the Out below still shows
# 'Embarked' and no Q/S — this cell ran against an older state of `train`
# (out-of-order execution); re-verify on a fresh kernel run.
train.drop('Q',axis=1,inplace=True)
train
Out[92]:
Survived Pclass Age SibSp Parch Fare Embarked male
0 0 3 22.0 1 0 7.2500 S 1
1 1 1 38.0 1 0 71.2833 C 0
2 1 3 26.0 0 0 7.9250 S 0
3 1 1 35.0 1 0 53.1000 S 0
4 0 3 35.0 0 0 8.0500 S 1
... ... ... ... ... ... ... ... ...
886 0 2 27.0 0 0 13.0000 S 1
887 1 1 19.0 0 0 30.0000 S 0
888 0 3 24.0 1 2 23.4500 S 0
889 1 1 26.0 0 0 30.0000 C 1
890 0 3 32.0 0 0 7.7500 Q 1

889 rows × 8 columns

In [ ]:
####################################
# dates - timestamps treatment
####################################
# Parse the string column into datetime64 so the .dt accessor is available.
df['timeStamp'] = pd.to_datetime(df['timeStamp'])

# Vectorized .dt accessor replaces the per-row .apply(lambda t: ...) calls:
# same results, much faster on large frames.
df['Hour'] = df['timeStamp'].dt.hour
df['Month'] = df['timeStamp'].dt.month
df['Day of Week'] = df['timeStamp'].dt.dayofweek

# Map numeric weekday (Monday=0 ... Sunday=6) to a short name.
dmap = {0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'}
df['Day of Week'] = df['Day of Week'].map(dmap)
In [ ]:
# fast cross checking datas
# (placeholder parameter values — substitute a real column name for 'COL_NAME')
sns.pairplot(data=df,hue='COL_NAME',palette='bwr') # placeholder parameters

image.png

In [ ]:
 
In [24]:
####################################
# inset plot (one axes embedded inside another)
####################################
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_axes([0,0,1,1]) # full-size axes: [left, bottom, width, height] in figure fractions
ax2 = fig.add_axes([.3,.3,.6,.5]) # smaller axes drawn on top: the inset
ax1.hist(train['Fare'],bins=30) # histogram of all data
ax2.hist(train[(train['Fare']<60)]['Fare'],bins=50) # inset zoom on Fares < 60, the most populated range
Out[24]:
(array([ 15.,   0.,   0.,   1.,   1.,  25., 247.,  40.,  35.,   9.,  55.,
         10.,  28.,  28.,   5.,   8.,   8.,  19.,   5.,   8.,  10.,  39.,
         28.,  16.,   8.,  18.,  14.,   5.,   6.,   6.,   2.,   2.,   5.,
         10.,   3.,   1.,   0.,   0.,   0.,   7.,   0.,   3.,   2.,  10.,
          8.,   0.,   3.,  13.,   2.,   1.]),
 array([ 0.   ,  1.188,  2.376,  3.564,  4.752,  5.94 ,  7.128,  8.316,
         9.504, 10.692, 11.88 , 13.068, 14.256, 15.444, 16.632, 17.82 ,
        19.008, 20.196, 21.384, 22.572, 23.76 , 24.948, 26.136, 27.324,
        28.512, 29.7  , 30.888, 32.076, 33.264, 34.452, 35.64 , 36.828,
        38.016, 39.204, 40.392, 41.58 , 42.768, 43.956, 45.144, 46.332,
        47.52 , 48.708, 49.896, 51.084, 52.272, 53.46 , 54.648, 55.836,
        57.024, 58.212, 59.4  ]),
 <BarContainer object of 50 artists>)
In [11]:
loans = pd.read_csv('datas/loan_data.csv')
####################################
# 2 histograms on same plot
# Overlay FICO distributions for the two values of a binary column,
# using transparency so both remain visible.
####################################
n_bins = 25
bar_alpha = 0.5

# variable column
col = 'not.fully.paid'
#col='credit.policy'

plt.figure(figsize=(12,6))
for value, shade in ((1, 'red'), (0, 'blue')):
    fico_subset = loans.loc[loans[col] == value, 'fico']
    fico_subset.hist(label=col + '=' + str(value), bins=n_bins, alpha=bar_alpha, color=shade)
plt.legend()
plt.xlabel('FICO')
Out[11]:
Text(0.5, 0, 'FICO')
In [12]:
loans = pd.read_csv('datas/loan_data.csv')
sns.lmplot(data=loans,x='fico',y='int.rate',
           hue='credit.policy',col='not.fully.paid')
Out[12]:
<seaborn.axisgrid.FacetGrid at 0x2a0603c97b0>
In [14]:
ad_data = pd.read_csv('datas/advertising.csv')
sns.pairplot(data=ad_data,hue='Clicked on Ad',palette='bwr')
Out[14]:
<seaborn.axisgrid.PairGrid at 0x2a061023c10>
In [16]:
####################################
#interactive version
# (re-import is redundant — cufflinks is already set up in the first cell —
#  but harmless; kept so this section is self-contained)
####################################
import cufflinks as cf
cf.go_offline()
In [19]:
train[train['Fare']<60]['Fare'].iplot(kind='hist',bins=50,color='green')
In [11]:
Image(filename='imgs/13-Logistic-Regression--01-Logistic Regression with Python--iplot-hist.png')
Out[11]:
In [25]:
####################################
# Multiple interactive plots
# (re-imports duplicate the first cell's setup; kept so this section
#  is self-contained)
####################################
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot


from plotly import __version__
print(__version__)
5.11.0
In [ ]:
#hist1 = go.Histogram(x=train['Fare'],name='All Fares',)
# NOTE(review): go.Bar() with no data yields an empty left panel — presumably
# the commented-out Histogram above was intended; confirm before reuse.
hist1 = go.Bar()
hist2 = go.Histogram(x=train[train['Fare']<60]['Fare'],name='Fare < 60')

fig1 = go.Figure(data=hist1)
fig2 = go.Figure(data=hist2)

# cufflinks helper lays the two figures side by side (1 row x 2 columns)
figs = cf.subplots([fig1, fig2],shape=(1,2))
iplot(figs)
In [3]:
# http://localhost:8888/notebooks/Refactored_Py_DS_ML_Bootcamp-master/13-Logistic-Regression/02-Logistic%20Regression%20Project.ipynb
# 
# 2D kernel-density joint plot of Age vs Daily Time Spent on Site,
# with faded marginal distributions.
ad_data = pd.read_csv('datas/advertising.csv')
sns.jointplot(data=ad_data,x='Age',y='Daily Time Spent on Site',
              kind='kde',color='red',fill=True,
              marginal_kws=dict(alpha=0.1))
Out[3]:
<seaborn.axisgrid.JointGrid at 0x1bcf8378400>
In [13]:
Image(filename='imgs/13-Logistic-Regression--02-Logistic Regression Project--JoinPlot.png')
Out[13]:
In [ ]:
# Drop identifier/text columns the models cannot use directly.
train.drop(['PassengerId','Name','Sex','Ticket'],axis=1,inplace=True)
In [78]:
pd.get_dummies(train['Embarked'],drop_first=True)
Out[78]:
Q S
0 0 1
1 0 0
2 0 1
3 0 1
4 0 1
... ... ...
886 0 1
887 0 1
888 0 1
889 0 0
890 1 0

889 rows × 2 columns

Machine Learning

LogisticRegression

In [4]:
######## CONVERT LINEAR Regression to LOGISTIC Regression ########

Image('imgs/linear-to-logistic-1.JPG')
Out[4]:

Convert formula¶

$$\phi(z) = \frac {1} {1+ e^{-z}} $$
In [5]:
Image('imgs/linear-to-logistic-2.JPG')
Out[5]:
In [ ]:
######## LogisticRegression ########

#####################################
# Prepare trainings and tests data
#####################################
from sklearn.model_selection import train_test_split
y = train['Survived'] # target: 0/1 survival flag
X = train.drop('Survived',axis=1) # features: every other (numeric) column
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size=0.30, 
                                                    random_state=101)

#####################################
# Train model
#####################################
from sklearn.linear_model import LogisticRegression

logmodel = LogisticRegression() # create instance of Logistic model
logmodel.fit(X=X_train,y=y_train)

#####################################
# Run predictions
# (predicted 0/1 labels for the held-out 30% test split)
#####################################
predictions = logmodel.predict(X_test)

K Nearest Neighbors (KNN)

In [7]:
######## K Nearest Neighbors (KNN) ########
df = pd.read_csv('datas/KNN_Project_Data')
#####################################
# Prepare trainings and tests data
#####################################
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Feature matrix computed once (the original built it twice with
# identical drop calls).
features = df.drop('TARGET CLASS',axis=1)

scaler = StandardScaler()
# fit scaler parameters (mean/std per feature)
scaler.fit(features)

# scale features to standardize everything to the same scale, so
# large-scale values do not dominate small-scale ones in the distance metric
scaled_features = scaler.transform(features)
#==> returns values around 0 [-1.... .. +1.....]

df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])

X = scaled_features
y = df['TARGET CLASS']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=101)

#####################################
# Train model
#####################################
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X=X_train, y=y_train)

#####################################
# Run predictions
#####################################
predictions = knn.predict(X_test)
predictions


#####################################
# Find Best K value: refit for K = 1..59 and track the error rate
#####################################
error_rate = []
iMaxLoop = 60

for i in range(1,iMaxLoop) :
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i!=y_test))

    print("**************************")
    print("**classification_report** WITH K=",str(i))
    print(classification_report(y_test,pred_i))
    print("**confusion_matrix** WITH K=",str(i))
    conf_matrix = confusion_matrix(y_test,pred_i)
    print(conf_matrix)
    # sklearn layout: row = true class, column = predicted class, so
    # conf_matrix[0,1] counts false POSITIVES (true 0, predicted 1) and
    # conf_matrix[1,0] counts false NEGATIVES (true 1, predicted 0).
    # (The original variable names had these two swapped; the printed
    # total is unchanged.)
    false_pos = conf_matrix[0,1]
    false_neg = conf_matrix[1,0]
    print("Errors :",str(false_neg+false_pos))

plt.figure(figsize=(10,6))
plt.plot(range(1,iMaxLoop),error_rate,
        color='blue',linestyle='dashed', #linestyle='--',
         marker='o',markerfacecolor='red',markersize=10)
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.title('Error rate vs K value')
**************************
**classification_report** WITH K= 1
              precision    recall  f1-score   support

           0       0.73      0.72      0.72       152
           1       0.71      0.72      0.72       148

    accuracy                           0.72       300
   macro avg       0.72      0.72      0.72       300
weighted avg       0.72      0.72      0.72       300

**confusion_matrix** WITH K= 1
[[109  43]
 [ 41 107]]
Errors : 84
**************************
**classification_report** WITH K= 2
              precision    recall  f1-score   support

           0       0.67      0.85      0.75       152
           1       0.79      0.57      0.66       148

    accuracy                           0.71       300
   macro avg       0.73      0.71      0.70       300
weighted avg       0.73      0.71      0.70       300

**confusion_matrix** WITH K= 2
[[129  23]
 [ 64  84]]
Errors : 87
**************************
**classification_report** WITH K= 3
              precision    recall  f1-score   support

           0       0.80      0.77      0.78       152
           1       0.77      0.80      0.78       148

    accuracy                           0.78       300
   macro avg       0.78      0.78      0.78       300
weighted avg       0.78      0.78      0.78       300

**confusion_matrix** WITH K= 3
[[117  35]
 [ 30 118]]
Errors : 65
**************************
**classification_report** WITH K= 4
              precision    recall  f1-score   support

           0       0.75      0.86      0.80       152
           1       0.83      0.70      0.76       148

    accuracy                           0.78       300
   macro avg       0.79      0.78      0.78       300
weighted avg       0.79      0.78      0.78       300

**confusion_matrix** WITH K= 4
[[130  22]
 [ 44 104]]
Errors : 66
**************************
**classification_report** WITH K= 5
              precision    recall  f1-score   support

           0       0.79      0.80      0.80       152
           1       0.79      0.78      0.79       148

    accuracy                           0.79       300
   macro avg       0.79      0.79      0.79       300
weighted avg       0.79      0.79      0.79       300

**confusion_matrix** WITH K= 5
[[122  30]
 [ 32 116]]
Errors : 62
**************************
**classification_report** WITH K= 6
              precision    recall  f1-score   support

           0       0.76      0.86      0.80       152
           1       0.83      0.72      0.77       148

    accuracy                           0.79       300
   macro avg       0.79      0.79      0.79       300
weighted avg       0.79      0.79      0.79       300

**confusion_matrix** WITH K= 6
[[130  22]
 [ 41 107]]
Errors : 63
**************************
**classification_report** WITH K= 7
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       152
           1       0.81      0.82      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 7
[[123  29]
 [ 26 122]]
Errors : 55
**************************
**classification_report** WITH K= 8
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       152
           1       0.83      0.78      0.80       148

    accuracy                           0.81       300
   macro avg       0.81      0.81      0.81       300
weighted avg       0.81      0.81      0.81       300

**confusion_matrix** WITH K= 8
[[128  24]
 [ 33 115]]
Errors : 57
**************************
**classification_report** WITH K= 9
              precision    recall  f1-score   support

           0       0.81      0.81      0.81       152
           1       0.81      0.81      0.81       148

    accuracy                           0.81       300
   macro avg       0.81      0.81      0.81       300
weighted avg       0.81      0.81      0.81       300

**confusion_matrix** WITH K= 9
[[123  29]
 [ 28 120]]
Errors : 57
**************************
**classification_report** WITH K= 10
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       152
           1       0.84      0.79      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 10
[[130  22]
 [ 31 117]]
Errors : 53
**************************
**classification_report** WITH K= 11
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       152
           1       0.81      0.82      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 11
[[123  29]
 [ 26 122]]
Errors : 55
**************************
**classification_report** WITH K= 12
              precision    recall  f1-score   support

           0       0.81      0.84      0.82       152
           1       0.83      0.79      0.81       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 12
[[128  24]
 [ 31 117]]
Errors : 55
**************************
**classification_report** WITH K= 13
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       152
           1       0.81      0.82      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 13
[[123  29]
 [ 26 122]]
Errors : 55
**************************
**classification_report** WITH K= 14
              precision    recall  f1-score   support

           0       0.81      0.84      0.82       152
           1       0.83      0.80      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 14
[[127  25]
 [ 29 119]]
Errors : 54
**************************
**classification_report** WITH K= 15
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       152
           1       0.81      0.83      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 15
[[123  29]
 [ 25 123]]
Errors : 54
**************************
**classification_report** WITH K= 16
              precision    recall  f1-score   support

           0       0.81      0.84      0.82       152
           1       0.83      0.80      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 16
[[127  25]
 [ 29 119]]
Errors : 54
**************************
**classification_report** WITH K= 17
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 17
[[124  28]
 [ 23 125]]
Errors : 51
**************************
**classification_report** WITH K= 18
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       152
           1       0.83      0.82      0.82       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 18
[[127  25]
 [ 27 121]]
Errors : 52
**************************
**classification_report** WITH K= 19
              precision    recall  f1-score   support

           0       0.83      0.82      0.83       152
           1       0.82      0.82      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 19
[[125  27]
 [ 26 122]]
Errors : 53
**************************
**classification_report** WITH K= 20
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       152
           1       0.82      0.81      0.81       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 20
[[125  27]
 [ 28 120]]
Errors : 55
**************************
**classification_report** WITH K= 21
              precision    recall  f1-score   support

           0       0.84      0.81      0.82       152
           1       0.81      0.84      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 21
[[123  29]
 [ 24 124]]
Errors : 53
**************************
**classification_report** WITH K= 22
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       152
           1       0.81      0.82      0.81       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 22
[[124  28]
 [ 27 121]]
Errors : 55
**************************
**classification_report** WITH K= 23
              precision    recall  f1-score   support

           0       0.85      0.82      0.83       152
           1       0.82      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 23
[[124  28]
 [ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 24
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       152
           1       0.82      0.82      0.82       148

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

**confusion_matrix** WITH K= 24
[[125  27]
 [ 27 121]]
Errors : 54
**************************
**classification_report** WITH K= 25
              precision    recall  f1-score   support

           0       0.85      0.82      0.83       152
           1       0.82      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 25
[[124  28]
 [ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 26
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 26
[[125  27]
 [ 24 124]]
Errors : 51
**************************
**classification_report** WITH K= 27
              precision    recall  f1-score   support

           0       0.85      0.82      0.83       152
           1       0.82      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 27
[[124  28]
 [ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 28
              precision    recall  f1-score   support

           0       0.83      0.82      0.83       152
           1       0.82      0.83      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 28
[[125  27]
 [ 25 123]]
Errors : 52
**************************
**classification_report** WITH K= 29
              precision    recall  f1-score   support

           0       0.85      0.81      0.83       152
           1       0.81      0.86      0.84       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 29
[[123  29]
 [ 21 127]]
Errors : 50
**************************
**classification_report** WITH K= 30
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 30
[[124  28]
 [ 24 124]]
Errors : 52
**************************
**classification_report** WITH K= 31
              precision    recall  f1-score   support

           0       0.87      0.81      0.84       152
           1       0.82      0.87      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 31
[[123  29]
 [ 19 129]]
Errors : 48
**************************
**classification_report** WITH K= 32
              precision    recall  f1-score   support

           0       0.85      0.82      0.83       152
           1       0.82      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 32
[[124  28]
 [ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 33
              precision    recall  f1-score   support

           0       0.85      0.80      0.82       152
           1       0.81      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 33
[[122  30]
 [ 22 126]]
Errors : 52
**************************
**classification_report** WITH K= 34
              precision    recall  f1-score   support

           0       0.84      0.81      0.83       152
           1       0.81      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 34
[[123  29]
 [ 23 125]]
Errors : 52
**************************
**classification_report** WITH K= 35
              precision    recall  f1-score   support

           0       0.85      0.81      0.83       152
           1       0.81      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 35
[[123  29]
 [ 22 126]]
Errors : 51
**************************
**classification_report** WITH K= 36
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 36
[[125  27]
 [ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 37
              precision    recall  f1-score   support

           0       0.86      0.82      0.84       152
           1       0.82      0.86      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 37
[[125  27]
 [ 21 127]]
Errors : 48
**************************
**classification_report** WITH K= 38
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       152
           1       0.83      0.84      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 38
[[126  26]
 [ 23 125]]
Errors : 49
**************************
**classification_report** WITH K= 39
              precision    recall  f1-score   support

           0       0.86      0.82      0.84       152
           1       0.82      0.86      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 39
[[125  27]
 [ 21 127]]
Errors : 48
**************************
**classification_report** WITH K= 40
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 40
[[125  27]
 [ 24 124]]
Errors : 51
**************************
**classification_report** WITH K= 41
              precision    recall  f1-score   support

           0       0.85      0.81      0.83       152
           1       0.81      0.86      0.84       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 41
[[123  29]
 [ 21 127]]
Errors : 50
**************************
**classification_report** WITH K= 42
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 42
[[125  27]
 [ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 43
              precision    recall  f1-score   support

           0       0.86      0.82      0.84       152
           1       0.82      0.86      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 43
[[124  28]
 [ 21 127]]
Errors : 49
**************************
**classification_report** WITH K= 44
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 44
[[124  28]
 [ 24 124]]
Errors : 52
**************************
**classification_report** WITH K= 45
              precision    recall  f1-score   support

           0       0.85      0.82      0.83       152
           1       0.82      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 45
[[124  28]
 [ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 46
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 46
[[125  27]
 [ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 47
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 47
[[124  28]
 [ 23 125]]
Errors : 51
**************************
**classification_report** WITH K= 48
              precision    recall  f1-score   support

           0       0.84      0.83      0.83       152
           1       0.83      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 48
[[126  26]
 [ 24 124]]
Errors : 50
**************************
**classification_report** WITH K= 49
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 49
[[124  28]
 [ 24 124]]
Errors : 52
**************************
**classification_report** WITH K= 50
              precision    recall  f1-score   support

           0       0.84      0.83      0.83       152
           1       0.83      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 50
[[126  26]
 [ 24 124]]
Errors : 50
**************************
**classification_report** WITH K= 51
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       152
           1       0.83      0.84      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 51
[[126  26]
 [ 23 125]]
Errors : 49
**************************
**classification_report** WITH K= 52
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       152
           1       0.83      0.84      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 52
[[127  25]
 [ 24 124]]
Errors : 49
**************************
**classification_report** WITH K= 53
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       152
           1       0.83      0.85      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 53
[[126  26]
 [ 22 126]]
Errors : 48
**************************
**classification_report** WITH K= 54
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       152
           1       0.84      0.84      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 54
[[128  24]
 [ 24 124]]
Errors : 48
**************************
**classification_report** WITH K= 55
              precision    recall  f1-score   support

           0       0.85      0.82      0.84       152
           1       0.82      0.85      0.84       148

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300

**confusion_matrix** WITH K= 55
[[125  27]
 [ 22 126]]
Errors : 49
**************************
**classification_report** WITH K= 56
              precision    recall  f1-score   support

           0       0.83      0.82      0.83       152
           1       0.82      0.83      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 56
[[125  27]
 [ 25 123]]
Errors : 52
**************************
**classification_report** WITH K= 57
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 57
[[125  27]
 [ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 58
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       152
           1       0.82      0.84      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 58
[[125  27]
 [ 24 124]]
Errors : 51
**************************
**classification_report** WITH K= 59
              precision    recall  f1-score   support

           0       0.85      0.81      0.83       152
           1       0.81      0.85      0.83       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

**confusion_matrix** WITH K= 59
[[123  29]
 [ 22 126]]
Errors : 51
Out[7]:
Text(0.5, 1.0, 'Error rate vs K value')

Decision Trees and Random Forest¶

¶

Entropy and Information Gain are the Mathematical Methods of choosing the best split.¶

image.png

In [5]:
######## Decision Trees and Random Forests ########
# Fit a single decision tree on the kyphosis dataset and evaluate it
# on a held-out test split.
df = pd.read_csv('datas/kyphosis.csv')
#####################################
from sklearn.model_selection import train_test_split

# Features / target split: 'Kyphosis' is the label column, every other
# column is used as a predictor.
X = df.drop('Kyphosis', axis=1)
y = df['Kyphosis']

# random_state pins the train/test split so the reported metrics are
# reproducible across kernel restarts (Restart & Run All gives the
# same output every time).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42)

from sklearn.tree import DecisionTreeClassifier
# Seed the tree as well: tie-breaking between equally good splits is
# otherwise random, which would make the tree (and its metrics) vary.
dtree = DecisionTreeClassifier(random_state=42)

dtree.fit(X_train, y_train)

predictions = dtree.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix

print("**classification_report**",)
print(classification_report(y_test, predictions))
print("**confusion_matrix**")
print(confusion_matrix(y_test, predictions))
**classification_report**
              precision    recall  f1-score   support

      absent       0.88      0.75      0.81        20
     present       0.38      0.60      0.46         5

    accuracy                           0.72        25
   macro avg       0.63      0.68      0.64        25
weighted avg       0.78      0.72      0.74        25

**confusion_matrix**
[[15  5]
 [ 2  3]]
In [7]:
#####################################
# Random forest on the same train/test split: an ensemble of decision
# trees, typically more robust than the single tree fitted above.
from sklearn.ensemble import RandomForestClassifier

# n_estimators: number of trees in the ensemble.
# random_state pins the bootstrap sampling and per-split feature
# selection so the reported metrics are reproducible on re-run.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)

print("**classification_report**",)
print(classification_report(y_test, rfc_pred))
print("**confusion_matrix**")
print(confusion_matrix(y_test, rfc_pred))
**classification_report**
              precision    recall  f1-score   support

      absent       0.86      0.95      0.90        20
     present       0.67      0.40      0.50         5

    accuracy                           0.84        25
   macro avg       0.77      0.68      0.70        25
weighted avg       0.82      0.84      0.82        25

**confusion_matrix**
[[19  1]
 [ 3  2]]
In [ ]:
######## Support Vector Machines (SVM) ########
In [ ]:
 
In [ ]:
#####################################
#####################################
# Check predictions results
#####################################
#####################################

Classification Error Metrics¶

  • True positives (TP)
  • True negatives (TN)
  • False positives (FP) *[Type I error]* (e.g. predicting that a man is pregnant)
  • False negatives (FN) *[Type II error]* (e.g. predicting that a pregnant woman is not pregnant)

image.png

In [7]:
Image(filename='imgs/confusionMatrix.JPG')
Out[7]:

Accuracy¶

  • Overall, how often is it correct ?
  • (correct predictions) / (total predictions)
  • (TP + TN) / total

= 150 /165 = 0.91

Accuracy is useful when target classes are well balanced, BUT not a good choice with **unbalanced** classes!

Misclassification Rate (Error Rate)¶

  • Overall, how often is it wrong ?
  • (wrong predictions) / (total predictions)
  • (FP + FN) / total

= 15 /165 = 0.09

Recall¶

  • Ability of a model to find all the relevant cases within a dataset.

TP / (TP + FN)
= 100 / 105 = 0.95

Precision¶

  • Proportion of the data points our model flagged as relevant that actually were relevant

TP / (TP + FP)
= 100 / 110 = 0.91

Recall & Precision¶

While recall expresses the ability to find all relevant instances in a dataset,
precision expresses the proportion of the data points the model flagged as relevant that actually were relevant.

F1-Score¶

  • "harmonic mean" of Precision and Recall ==> punishes extreme values
$${F}_1 = 2 * \frac {precision * recall} {precision + recall}$$

image.png

In [9]:
Image('imgs/confusionMatrixFormulas.png')
Out[9]:

Regression Evaluation Metrics¶

Here are three common evaluation metrics for regression problems:

Mean Absolute Error (MAE) is the mean of the absolute value of the errors:

$$\frac 1n\sum_{i=1}^n|y_i-\hat{y}_i|$$

(somme des différences (en valeur absolue) entre valeur prédite et valeur réelle / (nombre de prédictions)
==> large errors not punished)

Mean Squared Error (MSE) is the mean of the squared errors:

$$\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2$$

(somme des {différences (en valeur absolue) entre valeur prédite et valeur réelle}^2 / (nombre de prédictions)
==> unité -> unité^2)

Root Mean Squared Error (RMSE) is the square root of the mean of the squared errors:

$$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$

RacineCarrée[somme des {différences entre valeur prédite et valeur réelle}^2 / (nombre de prédictions)]

Comparing these metrics:

  • MAE is the easiest to understand, because it's the average error.
  • MSE is more popular than MAE, because MSE "punishes" larger errors, which tends to be useful in the real world.
  • RMSE is even more popular than MSE, because RMSE is interpretable in the "y" units.

All of these are loss functions, because we want to minimize them.

In [100]:
# Show the text evaluation metrics for the held-out predictions:
# per-class precision / recall / f1-score, then the raw confusion matrix.
from sklearn.metrics import classification_report, confusion_matrix

report_text = classification_report(y_test, predictions)
matrix = confusion_matrix(y_test, predictions)

print("**classification_report**")
print(report_text)
print("**confusion_matrix**")
print(matrix)
**classification_report**
              precision    recall  f1-score   support

           0       0.83      0.90      0.86       163
           1       0.82      0.71      0.76       104

    accuracy                           0.83       267
   macro avg       0.83      0.81      0.81       267
weighted avg       0.83      0.83      0.83       267

**confusion_matrix**
[[147  16]
 [ 30  74]]
In [ ]: